{
 "cells": [
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "a2c34d0a",
   "metadata": {},
   "source": [
    "# 缺失值的处理\n",
    "\n",
    "Pandas模块支持对有缺失值的数据进行相关处理。导入相关模块："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "f1b480fc",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "05b42ebe",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "4c51b3b6",
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.DataFrame(np.random.randn(4, 5), columns=list(\"abcde\"))"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "67f0bc11",
   "metadata": {},
   "source": [
    "DataFrame的行标记可以通过赋值修改："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "518a876e",
   "metadata": {},
   "outputs": [],
   "source": [
    "df.index = pd.date_range(\"20000101\", periods=4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "728df367",
   "metadata": {},
   "outputs": [],
   "source": [
    "df.iloc[[2, 3], [3, 4]] = np.nan"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "e1e76f1b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>a</th>\n",
       "      <th>b</th>\n",
       "      <th>c</th>\n",
       "      <th>d</th>\n",
       "      <th>e</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>2000-01-01</th>\n",
       "      <td>2.428019</td>\n",
       "      <td>2.317165</td>\n",
       "      <td>1.186934</td>\n",
       "      <td>-0.208601</td>\n",
       "      <td>-1.278155</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2000-01-02</th>\n",
       "      <td>1.040802</td>\n",
       "      <td>-2.269797</td>\n",
       "      <td>0.580195</td>\n",
       "      <td>-0.241620</td>\n",
       "      <td>-0.620173</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2000-01-03</th>\n",
       "      <td>-2.392500</td>\n",
       "      <td>0.298097</td>\n",
       "      <td>0.407541</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2000-01-04</th>\n",
       "      <td>-0.945231</td>\n",
       "      <td>-0.258435</td>\n",
       "      <td>-0.290456</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                   a         b         c         d         e\n",
       "2000-01-01  2.428019  2.317165  1.186934 -0.208601 -1.278155\n",
       "2000-01-02  1.040802 -2.269797  0.580195 -0.241620 -0.620173\n",
       "2000-01-03 -2.392500  0.298097  0.407541       NaN       NaN\n",
       "2000-01-04 -0.945231 -0.258435 -0.290456       NaN       NaN"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "0ef742f1",
   "metadata": {},
   "source": [
    "可以使用`.dropna()`方法去掉所有包含缺失值的行，得到一个新的DataFrame："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "ae63ef76",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>a</th>\n",
       "      <th>b</th>\n",
       "      <th>c</th>\n",
       "      <th>d</th>\n",
       "      <th>e</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>2000-01-01</th>\n",
       "      <td>2.428019</td>\n",
       "      <td>2.317165</td>\n",
       "      <td>1.186934</td>\n",
       "      <td>-0.208601</td>\n",
       "      <td>-1.278155</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2000-01-02</th>\n",
       "      <td>1.040802</td>\n",
       "      <td>-2.269797</td>\n",
       "      <td>0.580195</td>\n",
       "      <td>-0.241620</td>\n",
       "      <td>-0.620173</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                   a         b         c         d         e\n",
       "2000-01-01  2.428019  2.317165  1.186934 -0.208601 -1.278155\n",
       "2000-01-02  1.040802 -2.269797  0.580195 -0.241620 -0.620173"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.dropna(how=\"any\")"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "e9d6fc25",
   "metadata": {},
   "source": [
    "`how`参数设为“any”表示只要该行有缺失值，就会被去掉；如果换成“all”，则表示只有该行全部缺失时才去掉。`.dropna()`方法还可以通过`axis`参数指定对行还是对列进行操作，默认值为0，即对行；如果要对列进行操作，可以将`axis`参数设为1："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "5a901cd3",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>a</th>\n",
       "      <th>b</th>\n",
       "      <th>c</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>2000-01-01</th>\n",
       "      <td>2.428019</td>\n",
       "      <td>2.317165</td>\n",
       "      <td>1.186934</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2000-01-02</th>\n",
       "      <td>1.040802</td>\n",
       "      <td>-2.269797</td>\n",
       "      <td>0.580195</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2000-01-03</th>\n",
       "      <td>-2.392500</td>\n",
       "      <td>0.298097</td>\n",
       "      <td>0.407541</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2000-01-04</th>\n",
       "      <td>-0.945231</td>\n",
       "      <td>-0.258435</td>\n",
       "      <td>-0.290456</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                   a         b         c\n",
       "2000-01-01  2.428019  2.317165  1.186934\n",
       "2000-01-02  1.040802 -2.269797  0.580195\n",
       "2000-01-03 -2.392500  0.298097  0.407541\n",
       "2000-01-04 -0.945231 -0.258435 -0.290456"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.dropna(axis=1, how=\"any\")"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "6d2601b1",
   "metadata": {},
   "source": [
    "也可以用`.fillna()`方法为缺失值补上默认值："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "6eea014d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>a</th>\n",
       "      <th>b</th>\n",
       "      <th>c</th>\n",
       "      <th>d</th>\n",
       "      <th>e</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>2000-01-01</th>\n",
       "      <td>2.428019</td>\n",
       "      <td>2.317165</td>\n",
       "      <td>1.186934</td>\n",
       "      <td>-0.208601</td>\n",
       "      <td>-1.278155</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2000-01-02</th>\n",
       "      <td>1.040802</td>\n",
       "      <td>-2.269797</td>\n",
       "      <td>0.580195</td>\n",
       "      <td>-0.241620</td>\n",
       "      <td>-0.620173</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2000-01-03</th>\n",
       "      <td>-2.392500</td>\n",
       "      <td>0.298097</td>\n",
       "      <td>0.407541</td>\n",
       "      <td>100.000000</td>\n",
       "      <td>100.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2000-01-04</th>\n",
       "      <td>-0.945231</td>\n",
       "      <td>-0.258435</td>\n",
       "      <td>-0.290456</td>\n",
       "      <td>100.000000</td>\n",
       "      <td>100.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                   a         b         c           d           e\n",
       "2000-01-01  2.428019  2.317165  1.186934   -0.208601   -1.278155\n",
       "2000-01-02  1.040802 -2.269797  0.580195   -0.241620   -0.620173\n",
       "2000-01-03 -2.392500  0.298097  0.407541  100.000000  100.000000\n",
       "2000-01-04 -0.945231 -0.258435 -0.290456  100.000000  100.000000"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.fillna(value=100)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "id": "966df83b",
   "metadata": {},
   "source": [
    "这两种方法都返回一个新的DataFrame对象，对原对象不产生影响："
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "e5244212",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>a</th>\n",
       "      <th>b</th>\n",
       "      <th>c</th>\n",
       "      <th>d</th>\n",
       "      <th>e</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>2000-01-01</th>\n",
       "      <td>2.428019</td>\n",
       "      <td>2.317165</td>\n",
       "      <td>1.186934</td>\n",
       "      <td>-0.208601</td>\n",
       "      <td>-1.278155</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2000-01-02</th>\n",
       "      <td>1.040802</td>\n",
       "      <td>-2.269797</td>\n",
       "      <td>0.580195</td>\n",
       "      <td>-0.241620</td>\n",
       "      <td>-0.620173</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2000-01-03</th>\n",
       "      <td>-2.392500</td>\n",
       "      <td>0.298097</td>\n",
       "      <td>0.407541</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2000-01-04</th>\n",
       "      <td>-0.945231</td>\n",
       "      <td>-0.258435</td>\n",
       "      <td>-0.290456</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                   a         b         c         d         e\n",
       "2000-01-01  2.428019  2.317165  1.186934 -0.208601 -1.278155\n",
       "2000-01-02  1.040802 -2.269797  0.580195 -0.241620 -0.620173\n",
       "2000-01-03 -2.392500  0.298097  0.407541       NaN       NaN\n",
       "2000-01-04 -0.945231 -0.258435 -0.290456       NaN       NaN"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c084ff1c",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
